Creative Commons License
MSGSU ISTATISTIK BOLUMU - R ILE ISTATISTIKSEL PROGRAMLAMA DERS NOTLARI is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License.

Hata ve öneriler için ozge.ozdamar@msgsu.edu.tr


REFERENCES


LIBRARIES

# Start from a clean workspace. Note that rm() only clears visible objects in
# the global environment; it does not detach packages or reset options, so
# restart R when a truly fresh session is needed.
rm(list = ls())

# Packages used throughout these notes (each listed once).
.packages <- c(
  "car", "doBy", "lubridate", "VIM", "mice", "Amelia",
  "naniar", "plyr", "sqldf", "dplyr"
)

# Compare against the installed package *names* only. Matching against the
# whole installed.packages() matrix would also search its other columns
# (Version, Depends, Imports, ...) and could wrongly report a package as
# already installed.
.inst <- .packages %in% rownames(installed.packages())
if (any(!.inst)) install.packages(.packages[!.inst])

# library() stops with an error if a package is still unavailable, whereas
# require() would only return FALSE and let later code fail obscurely.
invisible(lapply(.packages, library, character.only = TRUE))

FUNCTIONS

car::recode()
doBy::recodeVar()
base::within(), cut()
plyr::revalue(), mapvalues()

data

 data(mtcars)
 data(airquality)

1 Create new variable

# Small example data frame with two numeric columns.
veri<-data.frame(x1 = c(2, 2, 6, 4), x2 = c(3, 4, 2, 8))
veri
# Approach 1: build the new columns with explicit `$` references.
veri$sumx  <-  veri$x1 + veri$x2
veri$meanx <- (veri$x1 + veri$x2)/2
veri
# Approach 2: attach() puts the columns on the search path so x1 and x2 can be
# used unqualified; the results must still be assigned to veri$... explicitly.
# NOTE(review): attach() is generally discouraged (objects named x1/x2 in the
# global environment would mask the attached columns); always pair it with
# detach() as done here.
attach(veri)
veri$sumx  <-  x1 + x2
veri$meanx <- (x1 + x2)/2
detach(veri)
# Approach 3: transform() evaluates the expressions with the columns of veri
# in scope and returns a modified copy, so neither `$` nor attach() is needed.
veri <- base::transform(veri, sumx  =  x1 + x2,  meanx = (x1 + x2)/2)
veri

2 Recoding variables

Recoding involves creating new values of a variable conditional on the existing values of the same and/or other variables.

  • Change a continuous variable into a set of categories
  • Replace miscoded values with correct values
  • Create a pass/fail variable based on a set of cutoff scores
  • The statement variable[condition] <- expression will only make the assignment when condition is TRUE.
head(mtcars)
rank(mtcars$mpg)
##  [1] 19.5 19.5 24.5 21.5 15.0 14.0  4.0 26.0 24.5 16.5 13.0 11.0 12.0  7.5  1.5
## [16]  1.5  5.0 31.0 29.5 32.0 23.0  9.0  7.5  3.0 16.5 28.0 27.0 29.5 10.0 18.0
## [31]  6.0 21.5
# Recode mpg into three categories by conditional assignment:
#   "A": mpg < 10,  "B": 10 <= mpg <= 20,  "C": mpg > 20.
# Each statement only assigns to the rows where its condition is TRUE. The
# result is a character column; it is converted to a factor further below.
# No car in mtcars has mpg < 10, so only "B" and "C" actually occur.
mtcars$mpgcat1[mtcars$mpg  < 10] <-"A"
mtcars$mpgcat1[mtcars$mpg  >= 10  & mtcars$mpg <= 20] <-"B"
mtcars$mpgcat1[mtcars$mpg  > 20] <-"C"                    
mtcars
str(mtcars)
## 'data.frame':    32 obs. of  12 variables:
##  $ mpg    : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl    : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp   : num  160 160 108 258 360 ...
##  $ hp     : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat   : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt     : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec   : num  16.5 17 18.6 19.4 17 ...
##  $ vs     : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am     : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear   : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb   : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ mpgcat1: chr  "C" "C" "C" "C" ...
mtcars$mpgcat1 <- as.factor(mtcars$mpgcat1)
str(mtcars)
## 'data.frame':    32 obs. of  12 variables:
##  $ mpg    : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl    : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp   : num  160 160 108 258 360 ...
##  $ hp     : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat   : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt     : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec   : num  16.5 17 18.6 19.4 17 ...
##  $ vs     : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am     : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear   : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb   : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...

2.0.1 within

# Same three-way recode of mpg, written with within(): the braced expression
# is evaluated with the columns of mtcars in scope, and the variable created
# inside it (mpgcat2) is added to the returned data frame.
# Only mpgcat2 may be created inside the braces; any other local variable
# would become an extra column.
mtcars <- within(mtcars, {
  mpgcat2 <- NA
  mpgcat2[mpg > 20] <- "C"
  mpgcat2[mpg <= 20 & mpg >= 10] <- "B"
  mpgcat2[mpg < 10] <- "A"
})
mtcars
str(mtcars)
## 'data.frame':    32 obs. of  13 variables:
##  $ mpg    : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl    : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp   : num  160 160 108 258 360 ...
##  $ hp     : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat   : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt     : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec   : num  16.5 17 18.6 19.4 17 ...
##  $ vs     : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am     : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear   : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb   : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
##  $ mpgcat2: chr  "C" "C" "C" "C" ...
mtcars$mpgcat2 <- as.factor(mtcars$mpgcat2)
str(mtcars)
## 'data.frame':    32 obs. of  13 variables:
##  $ mpg    : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl    : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp   : num  160 160 108 258 360 ...
##  $ hp     : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat   : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt     : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec   : num  16.5 17 18.6 19.4 17 ...
##  $ vs     : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am     : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear   : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb   : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
##  $ mpgcat2: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...

2.0.2 car::recode()

# car::recode() takes the recoding rules as a single string of "old=new"
# pairs separated by semicolons; here each carb value is mapped to ten times
# itself.
a1<-car::recode(mtcars$carb,"1=10;2=20;3=30;4=40;6=60;8=80")
head(cbind(mtcars$carb,a1),10)
##         a1
##  [1,] 4 40
##  [2,] 4 40
##  [3,] 1 10
##  [4,] 1 10
##  [5,] 2 20
##  [6,] 1 10
##  [7,] 4 40
##  [8,] 2 20
##  [9,] 2 20
## [10,] 4 40
# ?recode
# http://rprogramming.net/recode-data-in-r/

2.0.3 doBy::recodeVar()

# doBy::recodeVar() maps each value in `src` to the value at the same position
# in `tgt` (1 -> "A", 2 -> "B", ..., 8 -> "H"). The result is a character
# column, as the str() output below shows.
mtcars$carb2<-doBy::recodeVar(mtcars$carb,src=c(1:8),tgt=c("A","B","C","D","E","F","G","H"))
head(cbind(mtcars$carb2),10) 
##       [,1]
##  [1,] "D" 
##  [2,] "D" 
##  [3,] "A" 
##  [4,] "A" 
##  [5,] "B" 
##  [6,] "A" 
##  [7,] "D" 
##  [8,] "B" 
##  [9,] "B" 
## [10,] "D"
str(mtcars) # carb2 char
## 'data.frame':    32 obs. of  14 variables:
##  $ mpg    : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl    : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp   : num  160 160 108 258 360 ...
##  $ hp     : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat   : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt     : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec   : num  16.5 17 18.6 19.4 17 ...
##  $ vs     : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am     : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear   : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb   : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
##  $ mpgcat2: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
##  $ carb2  : chr  "D" "D" "A" "A" ...
mtcars$carb2<- as.factor(mtcars$carb2)
str(mtcars)
## 'data.frame':    32 obs. of  14 variables:
##  $ mpg    : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl    : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp   : num  160 160 108 258 360 ...
##  $ hp     : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat   : num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt     : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec   : num  16.5 17 18.6 19.4 17 ...
##  $ vs     : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am     : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear   : num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb   : num  4 4 1 1 2 1 4 2 2 4 ...
##  $ mpgcat1: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
##  $ mpgcat2: Factor w/ 2 levels "B","C": 2 2 2 2 1 1 1 2 2 1 ...
##  $ carb2  : Factor w/ 6 levels "A","B","C","D",..: 4 4 1 1 2 1 4 2 2 4 ...
# ? recodeVar

2.0.4 base::cut()

# cut() bins a numeric vector into a factor. With the default right = TRUE the
# intervals are closed on the right:
#   (-Inf, 20] -> "A",  (20, 30] -> "B",  (30, Inf] -> "C".
# NOTE(review): these cut points differ from the 10/20 thresholds used for
# mpgcat1 and mpgcat2 earlier in these notes -- confirm this is intentional.
mtcars$mpgcat3<-cut(mtcars$mpg,
                    breaks=c(-Inf,20,30,Inf),
                    labels = c("A","B","C"))
head(mtcars)

2.0.5 plyr::revalue(), mapvalues()

# Both calls relabel the levels of carb after converting it to a factor:
# revalue() takes a named vector c(old = new), while mapvalues() takes
# parallel `from` / `to` vectors.
# NOTE(review): carb also takes the value 8, which is not listed in either
# mapping; presumably that level keeps its original label "8" -- confirm.
mtcars$carb3<-plyr::revalue(as.factor(mtcars$carb),c("1"="A","2"="B","3"="C","4"="D","6"="E"))
mtcars$carb4<-plyr::mapvalues(as.factor(mtcars$carb),from = c("1","2","3","4","6"),to = c("A","B","C","D","E"))
head(mtcars)

3 Renaming variables

fix(mtcars)
edit(mtcars)
data.entry(mtcars)
reshape::rename()
# rename(mtcars, c(wt = "weight", cyl = "cylinders"))
names(mtcars)
##  [1] "mpg"     "cyl"     "disp"    "hp"      "drat"    "wt"      "qsec"   
##  [8] "vs"      "am"      "gear"    "carb"    "mpgcat1" "mpgcat2" "carb2"  
## [15] "mpgcat3" "carb3"   "carb4"

4 Missing values

NA: Not Available

x <- c(1, 99, 3, NA, 5, 5, NA, 99, 3, 3, NA, 1, 3, 5, 1, 1 )
is.na(x)
##  [1] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
## [13] FALSE FALSE FALSE FALSE
x < 3
##  [1]  TRUE FALSE FALSE    NA FALSE FALSE    NA FALSE FALSE FALSE    NA  TRUE
## [13] FALSE FALSE  TRUE  TRUE
x == 99
##  [1] FALSE  TRUE FALSE    NA FALSE FALSE    NA  TRUE FALSE FALSE    NA FALSE
## [13] FALSE FALSE FALSE FALSE

NA cannot be tested with ==: any comparison with NA returns NA. Use is.na() instead.

x == NA 
##  [1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

sample data

data(airquality)
str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
is.na(airquality)
##        Ozone Solar.R  Wind  Temp Month   Day
##   [1,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [2,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [3,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [4,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [5,]  TRUE    TRUE FALSE FALSE FALSE FALSE
##   [6,] FALSE    TRUE FALSE FALSE FALSE FALSE
##   [7,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [8,] FALSE   FALSE FALSE FALSE FALSE FALSE
##   [9,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [10,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [11,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [12,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [13,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [14,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [15,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [16,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [17,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [18,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [19,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [20,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [21,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [22,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [23,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [24,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [25,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [26,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [27,]  TRUE    TRUE FALSE FALSE FALSE FALSE
##  [28,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [29,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [30,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [31,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [32,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [33,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [34,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [35,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [36,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [37,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [38,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [39,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [40,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [41,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [42,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [43,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [44,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [45,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [46,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [47,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [48,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [49,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [50,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [51,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [52,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [53,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [54,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [55,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [56,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [57,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [58,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [59,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [60,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [61,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [62,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [63,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [64,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [65,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [66,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [67,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [68,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [69,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [70,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [71,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [72,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [73,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [74,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [75,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [76,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [77,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [78,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [79,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [80,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [81,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [82,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [83,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [84,]  TRUE   FALSE FALSE FALSE FALSE FALSE
##  [85,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [86,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [87,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [88,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [89,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [90,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [91,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [92,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [93,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [94,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [95,] FALSE   FALSE FALSE FALSE FALSE FALSE
##  [96,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [97,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [98,] FALSE    TRUE FALSE FALSE FALSE FALSE
##  [99,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [100,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [101,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [102,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [103,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [104,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [105,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [106,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [107,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [108,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [109,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [110,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [111,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [112,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [113,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [114,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [115,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [116,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [117,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [118,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [119,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [120,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [121,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [122,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [123,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [124,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [125,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [126,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [127,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [128,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [129,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [130,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [131,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [132,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [133,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [134,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [135,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [136,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [137,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [138,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [139,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [140,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [141,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [142,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [143,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [144,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [145,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [146,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [147,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [148,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [149,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [150,]  TRUE   FALSE FALSE FALSE FALSE FALSE
## [151,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [152,] FALSE   FALSE FALSE FALSE FALSE FALSE
## [153,] FALSE   FALSE FALSE FALSE FALSE FALSE

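Arithmetic with NA yields NA (NA + anything = NA). Logical operators can still return a definite value when the result does not depend on the missing operand, e.g. NA | TRUE is TRUE.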

NaN: Not a Number. Try naniar::any_na() on special values: any_na(NaN), any_na(NULL), any_na(Inf)

NA | TRUE
## [1] TRUE
NA | FALSE
## [1] NA
NA + NaN
## [1] NA
NaN + NA
## [1] NA

4.1 Recoding values to missing

x[x == 99] <- NA
x
##  [1]  1 NA  3 NA  5  5 NA NA  3  3 NA  1  3  5  1  1

4.2 Missing data in arguments

na.omit, na.exclude, na.pass, na.fail, na.action, na.rm, na.last, useNA
mean(airquality$Ozone)
## [1] NA
mean(airquality$Ozone,na.rm=TRUE)
## [1] 42.12931
mean(na.omit(airquality$Ozone))
## [1] 42.12931
na.omit(airquality$Ozone)
##   [1]  41  36  12  18  28  23  19   8   7  16  11  14  18  14  34   6  30  11
##  [19]   1  11   4  32  23  45 115  37  29  71  39  23  21  37  20  12  13 135
##  [37]  49  32  64  40  77  97  97  85  10  27   7  48  35  61  79  63  16  80
##  [55] 108  20  52  82  50  64  59  39   9  16  78  35  66 122  89 110  44  28
##  [73]  65  22  59  23  31  44  21   9  45 168  73  76 118  84  85  96  78  73
##  [91]  91  47  32  20  23  21  24  44  21  28   9  13  46  18  13  24  16  13
## [109]  23  36   7  14  30  14  18  20
## attr(,"na.action")
##  [1]   5  10  25  26  27  32  33  34  35  36  37  39  42  43  45  46  52  53  54
## [20]  55  56  57  58  59  60  61  65  72  75  83  84 102 103 107 115 119 150
## attr(,"class")
## [1] "omit"
airquality$Ozone
##   [1]  41  36  12  18  NA  28  23  19   8  NA   7  16  11  14  18  14  34   6
##  [19]  30  11   1  11   4  32  NA  NA  NA  23  45 115  37  NA  NA  NA  NA  NA
##  [37]  NA  29  NA  71  39  NA  NA  23  NA  NA  21  37  20  12  13  NA  NA  NA
##  [55]  NA  NA  NA  NA  NA  NA  NA 135  49  32  NA  64  40  77  97  97  85  NA
##  [73]  10  27  NA   7  48  35  61  79  63  16  NA  NA  80 108  20  52  82  50
##  [91]  64  59  39   9  16  78  35  66 122  89 110  NA  NA  44  28  65  NA  22
## [109]  59  23  31  44  21   9  NA  45 168  73  NA  76 118  84  85  96  78  73
## [127]  91  47  32  20  23  21  24  44  21  28   9  13  46  18  13  24  16  13
## [145]  23  36   7  14  30  NA  14  18  20
na.omit(airquality)

returns the object with observations removed if they contain any missing values

na.exclude(airquality) 

na.exclude() also removes observations with missing values; the difference from na.omit() only shows up in some prediction and residual functions (see 4.2.1).

na.fail(airquality)

returns the object only if it contains no missing values

na.pass(airquality) 

returns the object unchanged

4.2.1 omit vs exclude

# Fit the same regression twice, differing only in the na.action used for
# rows with missing values. Both actions drop incomplete rows before fitting,
# so the coefficients printed below are identical; the difference only
# appears in the output of resid() and fitted().
omit.model<- lm(Ozone ~ Solar.R, data = airquality, na.action = na.omit)
exclude.model<- lm(Ozone ~ Solar.R, data = airquality, na.action = na.exclude)
omit.model
## 
## Call:
## lm(formula = Ozone ~ Solar.R, data = airquality, na.action = na.omit)
## 
## Coefficients:
## (Intercept)      Solar.R  
##     18.5987       0.1272
exclude.model
## 
## Call:
## lm(formula = Ozone ~ Solar.R, data = airquality, na.action = na.exclude)
## 
## Coefficients:
## (Intercept)      Solar.R  
##     18.5987       0.1272
resid(omit.model)
##           1           2           3           4           7           8 
##  -1.7601294   2.3957702 -25.5463532 -40.4014578 -33.6211440 -12.1880897 
##           9          12          13          14          15          16 
## -13.0148679 -35.1530373 -44.4766565 -39.4420122  -8.8644704 -47.0719285 
##          17          18          19          20          21          22 
## -23.6384662 -22.5176190 -29.5459452 -13.1939997 -18.6160499 -48.2916147 
##          23          24          28          29          30          31 
## -17.7778596   1.7020672   2.7481237  -5.6443762  68.0434167 -17.0778386 
##          38          40          41          44          47          48 
##  -5.7487173  15.3961782 -20.6731105 -14.4191880 -21.8872947 -17.7136649 
##          49          50          51          62          63          64 
##  -3.3038428 -21.8585604 -23.0203700  82.1938142  -1.1357151 -16.6097319 
##          66          67          68          69          70          71 
##  23.1473497 -18.5286231  23.3036573  44.4481447  43.8123183  44.1473497 
##          73          74          76          77          78          79 
## -42.1703595 -13.8526503 -17.7026608  -3.6616984 -18.4420122   6.1591698 
##          80          81          82          85          86          87 
##  36.6213664  16.4249125  -3.4888847  24.0146824  61.0434167  -8.8991148 
##          88          89          90          91          92          93 
##  22.9737200  36.3150694  -3.5691775  13.2284585   8.1012932   9.8465547 
##          94          95          99         100         101         104 
## -12.6506943 -12.3904537  70.9741280  41.2804250  65.0780610   0.9855401 
##         105         106         108         109         110         111 
## -25.3148469  26.4363246  -5.6274621  33.9158434 -10.2227340 -18.6270541 
##         112         113         114         116         117         118 
##   1.2398706 -30.5345331 -14.1766776  -0.5577654 119.1359376  27.0607388 
##         120         121         122         123         124         125 
##  31.5867221  70.7890861  35.2631028  42.4942012  56.1646719  34.3497137 
##         126         127         128         129         130         131 
##  31.1300275  48.3670359  16.3205714   1.7020672 -30.6443762 -23.5750875 
##         132         133         134         135         136         137 
## -26.8467403 -27.5345331  -4.6097319 -30.5345331 -20.8640624 -12.6506943 
##         138         139         140         141         142         143 
## -19.8412382  -2.7368972 -29.0837486  -9.0321901 -24.8640624 -28.1589474 
##         144         145         146         147         148         149 
## -35.8640624   2.6209584  -0.2747005 -17.8298261  -7.1420332 -13.1416252 
##         151         152         153 
## -28.8872947 -17.2573784 -26.9565833
resid(exclude.model)
##           1           2           3           4           5           6 
##  -1.7601294   2.3957702 -25.5463532 -40.4014578          NA          NA 
##           7           8           9          10          11          12 
## -33.6211440 -12.1880897 -13.0148679          NA          NA -35.1530373 
##          13          14          15          16          17          18 
## -44.4766565 -39.4420122  -8.8644704 -47.0719285 -23.6384662 -22.5176190 
##          19          20          21          22          23          24 
## -29.5459452 -13.1939997 -18.6160499 -48.2916147 -17.7778596   1.7020672 
##          25          26          27          28          29          30 
##          NA          NA          NA   2.7481237  -5.6443762  68.0434167 
##          31          32          33          34          35          36 
## -17.0778386          NA          NA          NA          NA          NA 
##          37          38          39          40          41          42 
##          NA  -5.7487173          NA  15.3961782 -20.6731105          NA 
##          43          44          45          46          47          48 
##          NA -14.4191880          NA          NA -21.8872947 -17.7136649 
##          49          50          51          52          53          54 
##  -3.3038428 -21.8585604 -23.0203700          NA          NA          NA 
##          55          56          57          58          59          60 
##          NA          NA          NA          NA          NA          NA 
##          61          62          63          64          65          66 
##          NA  82.1938142  -1.1357151 -16.6097319          NA  23.1473497 
##          67          68          69          70          71          72 
## -18.5286231  23.3036573  44.4481447  43.8123183  44.1473497          NA 
##          73          74          75          76          77          78 
## -42.1703595 -13.8526503          NA -17.7026608  -3.6616984 -18.4420122 
##          79          80          81          82          83          84 
##   6.1591698  36.6213664  16.4249125  -3.4888847          NA          NA 
##          85          86          87          88          89          90 
##  24.0146824  61.0434167  -8.8991148  22.9737200  36.3150694  -3.5691775 
##          91          92          93          94          95          96 
##  13.2284585   8.1012932   9.8465547 -12.6506943 -12.3904537          NA 
##          97          98          99         100         101         102 
##          NA          NA  70.9741280  41.2804250  65.0780610          NA 
##         103         104         105         106         107         108 
##          NA   0.9855401 -25.3148469  26.4363246          NA  -5.6274621 
##         109         110         111         112         113         114 
##  33.9158434 -10.2227340 -18.6270541   1.2398706 -30.5345331 -14.1766776 
##         115         116         117         118         119         120 
##          NA  -0.5577654 119.1359376  27.0607388          NA  31.5867221 
##         121         122         123         124         125         126 
##  70.7890861  35.2631028  42.4942012  56.1646719  34.3497137  31.1300275 
##         127         128         129         130         131         132 
##  48.3670359  16.3205714   1.7020672 -30.6443762 -23.5750875 -26.8467403 
##         133         134         135         136         137         138 
## -27.5345331  -4.6097319 -30.5345331 -20.8640624 -12.6506943 -19.8412382 
##         139         140         141         142         143         144 
##  -2.7368972 -29.0837486  -9.0321901 -24.8640624 -28.1589474 -35.8640624 
##         145         146         147         148         149         150 
##   2.6209584  -0.2747005 -17.8298261  -7.1420332 -13.1416252          NA 
##         151         152         153 
## -28.8872947 -17.2573784 -26.9565833
data.frame(resid(omit.model),resid(exclude.model)) # error
fitted(omit.model)
##        1        2        3        4        7        8        9       12 
## 42.76013 33.60423 37.54635 58.40146 56.62114 31.18809 21.01487 51.15304 
##       13       14       15       16       17       18       19       20 
## 55.47666 53.44201 26.86447 61.07193 57.63847 28.51762 59.54595 24.19400 
##       21       22       23       24       28       29       30       31 
## 19.61605 59.29161 21.77786 30.29793 20.25188 50.64438 46.95658 54.07784 
##       38       40       41       44       47       48       49       50 
## 34.74872 55.60382 59.67311 37.41919 42.88729 54.71366 23.30384 33.85856 
##       51       62       63       64       66       67       68       69 
## 36.02037 52.80619 50.13572 48.60973 40.85265 58.52862 53.69634 52.55186 
##       70       71       73       74       76       77       78       79 
## 53.18768 40.85265 52.17036 40.85265 24.70266 51.66170 53.44201 54.84083 
##       80       81       82       85       86       87       88       89 
## 42.37863 46.57509 19.48888 55.98532 46.95658 28.89911 29.02628 45.68493 
##       90       91       92       93       94       95       99      100 
## 53.56918 50.77154 50.89871 29.15345 21.65069 28.39045 51.02587 47.71957 
##      101      104      105      106      108      109      110      111 
## 44.92194 43.01446 53.31485 38.56368 27.62746 25.08416 33.22273 49.62705 
##      112      113      114      116      117      118      120      121 
## 42.76013 51.53453 23.17668 45.55777 48.86406 45.93926 44.41328 47.21091 
##      122      123      124      125      126      127      128      129 
## 48.73690 42.50580 39.83533 43.65029 41.86997 42.63296 30.67943 30.29793 
##      130      131      132      133      134      135      136      137 
## 50.64438 46.57509 47.84674 51.53453 48.60973 51.53453 48.86406 21.65069 
##      138      139      140      141      142      143      144      145 
## 32.84124 48.73690 47.08375 22.03219 48.86406 44.15895 48.86406 20.37904 
##      146      147      148      149      151      152      153 
## 36.27470 24.82983 21.14203 43.14163 42.88729 35.25738 46.95658
fitted(exclude.model)
##        1        2        3        4        5        6        7        8 
## 42.76013 33.60423 37.54635 58.40146       NA       NA 56.62114 31.18809 
##        9       10       11       12       13       14       15       16 
## 21.01487       NA       NA 51.15304 55.47666 53.44201 26.86447 61.07193 
##       17       18       19       20       21       22       23       24 
## 57.63847 28.51762 59.54595 24.19400 19.61605 59.29161 21.77786 30.29793 
##       25       26       27       28       29       30       31       32 
##       NA       NA       NA 20.25188 50.64438 46.95658 54.07784       NA 
##       33       34       35       36       37       38       39       40 
##       NA       NA       NA       NA       NA 34.74872       NA 55.60382 
##       41       42       43       44       45       46       47       48 
## 59.67311       NA       NA 37.41919       NA       NA 42.88729 54.71366 
##       49       50       51       52       53       54       55       56 
## 23.30384 33.85856 36.02037       NA       NA       NA       NA       NA 
##       57       58       59       60       61       62       63       64 
##       NA       NA       NA       NA       NA 52.80619 50.13572 48.60973 
##       65       66       67       68       69       70       71       72 
##       NA 40.85265 58.52862 53.69634 52.55186 53.18768 40.85265       NA 
##       73       74       75       76       77       78       79       80 
## 52.17036 40.85265       NA 24.70266 51.66170 53.44201 54.84083 42.37863 
##       81       82       83       84       85       86       87       88 
## 46.57509 19.48888       NA       NA 55.98532 46.95658 28.89911 29.02628 
##       89       90       91       92       93       94       95       96 
## 45.68493 53.56918 50.77154 50.89871 29.15345 21.65069 28.39045       NA 
##       97       98       99      100      101      102      103      104 
##       NA       NA 51.02587 47.71957 44.92194       NA       NA 43.01446 
##      105      106      107      108      109      110      111      112 
## 53.31485 38.56368       NA 27.62746 25.08416 33.22273 49.62705 42.76013 
##      113      114      115      116      117      118      119      120 
## 51.53453 23.17668       NA 45.55777 48.86406 45.93926       NA 44.41328 
##      121      122      123      124      125      126      127      128 
## 47.21091 48.73690 42.50580 39.83533 43.65029 41.86997 42.63296 30.67943 
##      129      130      131      132      133      134      135      136 
## 30.29793 50.64438 46.57509 47.84674 51.53453 48.60973 51.53453 48.86406 
##      137      138      139      140      141      142      143      144 
## 21.65069 32.84124 48.73690 47.08375 22.03219 48.86406 44.15895 48.86406 
##      145      146      147      148      149      150      151      152 
## 20.37904 36.27470 24.82983 21.14203 43.14163       NA 42.88729 35.25738 
##      153 
## 46.95658

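Both na.omit and na.exclude leave the missing observations out of the fit, but only na.exclude maintains their positions (as NA) in the residuals and fitted values; with na.omit those observations are dropped from the output entirely.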

summary(airquality$Ozone)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   18.00   31.50   42.13   63.25  168.00      37
table(airquality$Ozone)
## 
##   1   4   6   7   8   9  10  11  12  13  14  16  18  19  20  21  22  23  24  27 
##   1   1   1   3   1   3   1   3   2   4   4   4   4   1   4   4   1   6   2   1 
##  28  29  30  31  32  34  35  36  37  39  40  41  44  45  46  47  48  49  50  52 
##   3   1   2   1   3   1   2   2   2   2   1   1   3   2   1   1   1   1   1   1 
##  59  61  63  64  65  66  71  73  76  77  78  79  80  82  84  85  89  91  96  97 
##   2   1   1   2   1   1   1   2   1   1   2   1   1   1   1   2   1   1   1   2 
## 108 110 115 118 122 135 168 
##   1   1   1   1   1   1   1
table(airquality$Ozone,useNA="ifany")
## 
##    1    4    6    7    8    9   10   11   12   13   14   16   18   19   20   21 
##    1    1    1    3    1    3    1    3    2    4    4    4    4    1    4    4 
##   22   23   24   27   28   29   30   31   32   34   35   36   37   39   40   41 
##    1    6    2    1    3    1    2    1    3    1    2    2    2    2    1    1 
##   44   45   46   47   48   49   50   52   59   61   63   64   65   66   71   73 
##    3    2    1    1    1    1    1    1    2    1    1    2    1    1    1    2 
##   76   77   78   79   80   82   84   85   89   91   96   97  108  110  115  118 
##    1    1    2    1    1    1    1    2    1    1    1    2    1    1    1    1 
##  122  135  168 <NA> 
##    1    1    1   37
table(airquality$Ozone, useNA="always")
## 
##    1    4    6    7    8    9   10   11   12   13   14   16   18   19   20   21 
##    1    1    1    3    1    3    1    3    2    4    4    4    4    1    4    4 
##   22   23   24   27   28   29   30   31   32   34   35   36   37   39   40   41 
##    1    6    2    1    3    1    2    1    3    1    2    2    2    2    1    1 
##   44   45   46   47   48   49   50   52   59   61   63   64   65   66   71   73 
##    3    2    1    1    1    1    1    1    2    1    1    2    1    1    1    2 
##   76   77   78   79   80   82   84   85   89   91   96   97  108  110  115  118 
##    1    1    2    1    1    1    1    2    1    1    1    2    1    1    1    1 
##  122  135  168 <NA> 
##    1    1    1   37
length(airquality$Ozone)
## [1] 153
x1 <- sort(airquality$Ozone)
x1
##   [1]   1   4   6   7   7   7   8   9   9   9  10  11  11  11  12  12  13  13
##  [19]  13  13  14  14  14  14  16  16  16  16  18  18  18  18  19  20  20  20
##  [37]  20  21  21  21  21  22  23  23  23  23  23  23  24  24  27  28  28  28
##  [55]  29  30  30  31  32  32  32  34  35  35  36  36  37  37  39  39  40  41
##  [73]  44  44  44  45  45  46  47  48  49  50  52  59  59  61  63  64  64  65
##  [91]  66  71  73  73  76  77  78  78  79  80  82  84  85  85  89  91  96  97
## [109]  97 108 110 115 118 122 135 168
length(x1)
## [1] 116
x2 <- sort(airquality$Ozone, na.last = TRUE)
x2
##   [1]   1   4   6   7   7   7   8   9   9   9  10  11  11  11  12  12  13  13
##  [19]  13  13  14  14  14  14  16  16  16  16  18  18  18  18  19  20  20  20
##  [37]  20  21  21  21  21  22  23  23  23  23  23  23  24  24  27  28  28  28
##  [55]  29  30  30  31  32  32  32  34  35  35  36  36  37  37  39  39  40  41
##  [73]  44  44  44  45  45  46  47  48  49  50  52  59  59  61  63  64  64  65
##  [91]  66  71  73  73  76  77  78  78  79  80  82  84  85  85  89  91  96  97
## [109]  97 108 110 115 118 122 135 168  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## [127]  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
## [145]  NA  NA  NA  NA  NA  NA  NA  NA  NA
length(x2)
## [1] 153

4.3 detect, pattern, visualization

complete.cases, VIM::countNA(), mice::md.pattern()
x
##  [1]  1 NA  3 NA  5  5 NA NA  3  3 NA  1  3  5  1  1
complete.cases(x) # tam satirlar
##  [1]  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE
## [13]  TRUE  TRUE  TRUE  TRUE
is.na(x)
##  [1] FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE
## [13] FALSE FALSE FALSE FALSE
!complete.cases(x)  
##  [1] FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE
## [13] FALSE FALSE FALSE FALSE
str(airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
dim(airquality)
## [1] 153   6

missing data olmayan satirlar

airquality[complete.cases(airquality), ]
dim(airquality[complete.cases(airquality), ])
## [1] 111   6

en az bir missing data olan satirlar

airquality[!complete.cases(airquality), ]
dim(airquality[!complete.cases(airquality), ])
## [1] 42  6

mantiksal operatorlerin sayisal degerleri 1 ve 0 oldugundan,

# is.na() returns a logical vector; sum() counts its TRUE values.
sum(is.na(airquality$Ozone)) # number of missing observations in the Ozone variable
## [1] 37
VIM::countNA(airquality$Ozone) # same count using the VIM package
## [1] 37
# Proportion of missing observations in the Ozone variable.
# Uses airquality (the dataset of this section) instead of `sleep`: the base
# datasets::sleep has no Dream column, so sleep$Dream is NULL and the mean of
# an empty logical vector is NaN.
mean(is.na(airquality$Ozone))
## [1] 0.2418301
# Proportion of rows that contain at least one missing value.
mean(!complete.cases(airquality))
## [1] 0.2745098

missing data pattern

mice::md.pattern(airquality) # mice paketi

##     Wind Temp Month Day Solar.R Ozone   
## 111    1    1     1   1       1     1  0
## 35     1    1     1   1       1     0  1
## 5      1    1     1   1       0     1  1
## 2      1    1     1   1       0     0  2
##        0    0     0   0       7    37 44

**missing data visualization**

VIM package

a<-VIM::aggr(airquality, prop=FALSE, numbers=TRUE) # VIM paketi

summary(a) # missing data oruntusu
## 
##  Missings per variable: 
##  Variable Count
##     Ozone    37
##   Solar.R     7
##      Wind     0
##      Temp     0
##     Month     0
##       Day     0
## 
##  Missings in combinations of variables: 
##  Combinations Count   Percent
##   0:0:0:0:0:0   111 72.549020
##   0:1:0:0:0:0     5  3.267974
##   1:0:0:0:0:0    35 22.875817
##   1:1:0:0:0:0     2  1.307190
VIM::matrixplot(airquality)

## 
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
VIM::matrixplot(airquality, interactive = TRUE, sortby = "Ozone") #!

## 
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
VIM::marginplot(airquality[,c("Ozone","Solar.R")])

VIM::marginplot(airquality[c("Ozone","Solar.R")], pch=c(20),col=c("darkgray", "red", "blue"))

VIM::marginmatrix(airquality[,-5])

VIM::barMiss(airquality[,c("Month","Ozone")])

## 
## Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
airquality[,c("Month","Ozone")] # grafik verisini gormek icin
plot(airquality$Ozone, airquality$Solar.R)
VIM::rugNA(airquality$Ozone, airquality$Solar.R,side=1) # y eksenindeki missingleri x ekseni uzerinde gosteriyor
VIM::rugNA(airquality$Ozone, airquality$Solar.R,ticksize = 1, col= "orange", side=2) # miss argumani ekleyerek missing yerine imputed veriler gosterilebilir ?rugNA bakiniz

VIM::scattmatrixMiss(airquality) # all variables highlighted, delimiter argumani imputed degerler icin kullanilir, ?scattmatrixMiss

## 
## Click in a diagonal panel to add to or remove from the highlight selection.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
## 
## Highlighted missings in any of the variables 'Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day'.
VIM::scattmatrixMiss(airquality, highlight = "Ozone")

## 
## Click in a diagonal panel to add to or remove from the highlight selection.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
## 
## Highlighted 'missings' in variable 'Ozone'.
VIM::pbox(airquality) # parallel boxplots obs ve miss icin
## Warning in createPlot(main, sub, xlab, ylab, labels, ca$at): not enough space
## to display frequencies

## 
## Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
VIM::parcoordMiss(airquality) #paralel koordinatlar grafigi

## 
## Click on a coordinate axis to add to or remove from the highlight selection.
## Click in the top margin to toggle visualizing missing  values in the plot variables.
## To regain use of the VIM GUI and the R console, click in any of the other plot margins.
## 
## Highlighted missings in any of the variables 'Ozone', 'Solar.R', 'Wind', 'Temp', 'Month', 'Day'.
VIM::scattJitt(airquality[,1:2]) # Ozone ve Solar.R icin jittered scatterplot

plot(airquality[,1:2])

VIM::spineMiss(airquality[,c("Month","Solar.R")]) # spineplot /spinogram

## 
## Click in in the left margin to switch to the previous variable or in the right margin to switch to the next variable.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
VIM::scattMiss(airquality[,c("Ozone","Solar.R")]) # missing data line ile gosterilir

## 
## Click in bottom or left margin to change the 'side' argument accordingly.
## To regain use of the VIM GUI and the R console, click anywhere else in the graphics window.
VIM::mosaicMiss(airquality, high = 4, plotvars = 5:6) # mosaic plot

Amelia paketi ile gorsellestirme

Amelia::missmap(airquality)

5 Date values

Bugun <- Sys.Date()
Bugun
## [1] "2024-01-01"
class(Bugun)
## [1] "Date"
typeof(Bugun)
## [1] "double"
mode(Bugun)
## [1] "numeric"
Bu.an<-Sys.time()
Bu.an
## [1] "2024-01-01 19:49:48 +03"
class(Bu.an)
## [1] "POSIXct" "POSIXt"
typeof(Bu.an)
## [1] "double"
mode(Bu.an)
## [1] "numeric"
date()
## [1] "Mon Jan  1 19:49:48 2024"

YYYY-MM-DD

yarin <- as.Date("2018-05-05")
yarin
## [1] "2018-05-05"
weekdays(Bugun)
## [1] "Monday"
haftaya<-Bugun+7
haftaya
## [1] "2024-01-08"
x1<-1:15
Bugun + x1
##  [1] "2024-01-02" "2024-01-03" "2024-01-04" "2024-01-05" "2024-01-06"
##  [6] "2024-01-07" "2024-01-08" "2024-01-09" "2024-01-10" "2024-01-11"
## [11] "2024-01-12" "2024-01-13" "2024-01-14" "2024-01-15" "2024-01-16"
seq(Bugun, by=2, length.out = 15)
##  [1] "2024-01-01" "2024-01-03" "2024-01-05" "2024-01-07" "2024-01-09"
##  [6] "2024-01-11" "2024-01-13" "2024-01-15" "2024-01-17" "2024-01-19"
## [11] "2024-01-21" "2024-01-23" "2024-01-25" "2024-01-27" "2024-01-29"

use different date format

# Month names are matched in the current locale (English names assumed here).
# %b is the specifier for the abbreviated month name.
as.Date("Jan-10-2018", format = "%b-%d-%Y")
## [1] "2018-01-10"
# %B is the specifier for the full month name; %b accepts a full name on
# input only in some locales, so %B is the reliable choice here.
as.Date("January-10-2018", format = "%B-%d-%Y")
## [1] "2018-01-10"
dates <- c("02/27/92", "02/27/92", "01/14/92", "02/28/92", "02/01/92")
as.Date(dates, "%m/%d/%y")
## [1] "1992-02-27" "1992-02-27" "1992-01-14" "1992-02-28" "1992-02-01"

%Y = year in extended form: 2018 %y = year in short form: 18 %B = month in extended form: February %b = month in short form : Feb %m = month in numeric form : 2 %d = day of the month : 15 %j = day of the year : 46 %H = hour(24 hours) : 22 %I = hour (12 hours) : 10 %M = minutes %S = seconds

to specify time zone, use as.POSIXct(), as.POSIXlt()

zaman1 <- "May 5, 2018, 12:57:10"
zaman1.format <- "%B %d, %Y, %H:%M:%S"
zaman1.ct <- as.POSIXct(zaman1, format=zaman1.format, tz="GMT")
zaman1.ct
## [1] "2018-05-05 12:57:10 GMT"
class(zaman1.ct)
## [1] "POSIXct" "POSIXt"
typeof(zaman1.ct)
## [1] "double"
mode(zaman1.ct)
## [1] "numeric"
zaman1.lt<- as.POSIXlt(zaman1.ct)
zaman1.lt
## [1] "2018-05-05 12:57:10 GMT"
class(zaman1.lt)
## [1] "POSIXlt" "POSIXt"
typeof(zaman1.lt)
## [1] "list"
mode(zaman1.lt)
## [1] "list"
unclass(zaman1.lt)
## $sec
## [1] 10
## 
## $min
## [1] 57
## 
## $hour
## [1] 12
## 
## $mday
## [1] 5
## 
## $mon
## [1] 4
## 
## $year
## [1] 118
## 
## $wday
## [1] 6
## 
## $yday
## [1] 124
## 
## $isdst
## [1] 0
## 
## attr(,"tzone")
## [1] "GMT"

parse date: read date from text

zaman2 <- c("15:10:00 20/03/2018", "16:00:35 19/01/2017",
            "10:20:30 05/12/2017", "12:15:15 30/06/2018",
            "11:20:35 21/04/2017", "15:50:00 11/09/2018")
zaman2_str <- strptime(zaman2, "%H:%M:%S %d/%m/%Y",tz = "UTC")
zaman2_str
## [1] "2018-03-20 15:10:00 UTC" "2017-01-19 16:00:35 UTC"
## [3] "2017-12-05 10:20:30 UTC" "2018-06-30 12:15:15 UTC"
## [5] "2017-04-21 11:20:35 UTC" "2018-09-11 15:50:00 UTC"
mode(zaman2_str)
## [1] "list"

for more library(lubridate)

6 Type conversions

test

is.numeric(),  is.character(),  is.vector(),  is.matrix(), is.data.frame(),
is.factor(), is.logical()

convert

as.numeric(),  as.character(),  as.vector(), as.matrix(),  as.data.frame(),
as.factor(), as.logical()

more with libraries

7 Sorting data

# Reorder the working copy of mtcars by ascending mpg.
mpg_order <- order(mtcars$mpg)
mtcars <- mtcars[mpg_order, ]
head(mtcars)
# Sort by mpg first, then by disp within equal mpg values (both ascending).
mtcars1 <- mtcars[with(mtcars, order(mpg, disp)), ]
head(mtcars1)
# Sort by mpg ascending, then by qsec descending (negated sort key).
mtcars2 <- mtcars[with(mtcars, order(mpg, -qsec)), ]
head(mtcars2)

8 Merging datasets

x <- data.frame(k1 = c(NA,NA,3,4,5), k2 = c(1,NA,NA,4,5), data = 1:5)
y <- data.frame(k1 = c(NA,2,NA,4,5), k2 = c(NA,NA,3,4,5), data = 1:5)
  1. add column
cbind(x,y)

inner join

x
y
merge(x, y, by = "k1") # NA's match, so 6 rows
merge(x, y, by = "k2", incomparables = NA) # 2 rows
merge(x, y, by = c("k1","k2")) # NA's match

2.add row

rbind(x,y)

9 Subsetting datasets

dataframe[row indices, column indices]
a1 <- mtcars[, c(2,4)]
head(a1)
isim1 <- c("cyl", "hp", "qsec")
a2 <-mtcars[isim1]
head(a2)
a3 <- mtcars[c(-2,-4)]
head(a3)
a4 <- mtcars
a4$carb <- a4$wt <- NULL
head(a4)
a5 <- mtcars[1:3,]
head(a5)
a6<- mtcars[which(mtcars$cyl==6 & mtcars$disp == 160),]
head(a6)

10 SQL statements to manipulate data frames

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L
## '/Library/Frameworks/R.framework/Resources/library/tcltk/libs//tcltk.so'' had
## status 1
## Loading required package: RSQLite
## Warning: package 'RSQLite' was built under R version 4.2.3
newdf <- sqldf("select * from mtcars where carb=1 order by mpg",
               row.names=TRUE)
sqldf("select avg(mpg) as avg_mpg, avg(disp) as avg_disp, gear
      from mtcars where cyl in (4, 6) group by gear")
crashes <- read.csv("crashes.csv")
roads <- read.csv("roads.csv")
head(crashes)
print(roads)
##            Road       District Length
## 1 Interstate 65     Greenfield    262
## 2 Interstate 70      Vincennes    156
## 3         US-36 Crawfordsville    139
## 4         US-40     Greenfield    150
## 5         US-52 Crawfordsville    172
join_string <- "select crashes.* , roads.District, roads.Length
                            from crashes
                            left join roads
                            on crashes.Road = roads.Road"

join_string
## [1] "select crashes.* , roads.District, roads.Length\n                            from crashes\n                            left join roads\n                            on crashes.Road = roads.Road"
crashes_join_roads <- sqldf(join_string, stringsAsFactors = FALSE)
crashes_join_roads
join_string2 <- "select crashes.* , roads.District, roads.Length
                      from crashes
                      inner join roads
                      on crashes.Road = roads.Road"

crashes_join_roads2<- sqldf(join_string2, stringsAsFactors = FALSE)
head(crashes_join_roads2) 

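The merge statement in base R can perform the equivalent of inner and left joins, as well as right and full outer joins, which are unavailable in sqldf when its SQLite backend is older than version 3.39.0 (the release that added RIGHT and FULL OUTER JOIN).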

crashes_merge_roads <- merge(crashes, roads, by = c("Road"))
crashes_merge_roads
crashes_merge_roads2 <- merge(crashes, roads, by = c("Road"), all.x = TRUE)
crashes_merge_roads2
crashes_merge_roads3 <- merge(crashes, roads, by = c("Road"), all.y = TRUE)
crashes_merge_roads3
crashes_merge_roads4 <- merge(crashes, roads, by = c("Road"), all.x = TRUE, 
                              all.y = TRUE)
crashes_merge_roads4

Modifying the inner join query to include a where is the equivalent of combining merge and subset statements.

join_string2 <- "select  crashes.* , roads.District, roads.Length
                  from crashes
                  inner join roads
                  on crashes.Road = roads.Road
                  where crashes.Road = 'US-40'"                

crashes_join_roads4 <- sqldf(join_string2,stringsAsFactors = FALSE)
crashes_join_roads4

Aggregate functions available using SQLite can be used through the use of a group by clause.

group_string <- "select crashes.Road, avg(crashes.N_Crashes) as Mean_Crashes
                  from crashes
                  left join roads
                  on crashes.Road = roads.Road
                  group by 1"
sqldf(group_string)

While sqldf can make certain data manipulation operations easier, more advanced data manipulation tasks and calculations must be performed in R, such as using Hadley Wickham’s plyr package.

# Per-road summary of N_Crashes: ddply splits crashes_merge_roads by Road,
# applies the function to each piece and combines the results into one
# data frame.
plyr::ddply(crashes_merge_roads,
      c("Road"),
      # X is the subset of rows for a single Road.
      function(X) data.frame(Mean_Crashes = mean(X$N_Crashes),
                             Q1_Crashes = quantile(X$N_Crashes, 0.25),
                             Q3_Crashes = quantile(X$N_Crashes, 0.75),
                             # the median is taken as the 0.50 quantile
                             Median_Crashes = quantile(X$N_Crashes, 0.50))
)

11 strings

11.1 creating strings

c1 <- "Mustafa Akgül" 
c2 <- "Özgür Yazılım Kış Kampı"  
paste(c1, c2)
## [1] "Mustafa Akgül Özgür Yazılım Kış Kampı"
paste("The life of", pi)
## [1] "The life of 3.14159265358979"
paste("I", "love", "R")
## [1] "I love R"
paste("I", "love", "R", sep = "-")
## [1] "I-love-R"
paste0("I", "love", "R")
## [1] "IloveR"
paste("R", 1:5, sep = " v1.")
## [1] "R v1.1" "R v1.2" "R v1.3" "R v1.4" "R v1.5"
# paste0() has no `sep` argument: `sep = " v1."` is treated as one more string
# to concatenate, so " v1." ends up after the number instead of between parts.
paste0("R", 1:5, sep = " v1.")
## [1] "R1 v1." "R2 v1." "R3 v1." "R4 v1." "R5 v1."

11.2 convert to string

c3<-pi
c3<-as.character(pi)
c3
## [1] "3.14159265358979"
toString(c("Aug", 24, 1980))
## [1] "Aug, 24, 1980"

11.3 printing strings

  • print(): generic printing
  • noquote(): print with no quotes
  • cat(): concatenate and print with no quotes
  • sprintf(): a wrapper for the C function sprintf, that returns a character vector containing a formatted combination of text and variable values
print(c2)
## [1] "Özgür Yazılım Kış Kampı"
print(c2, quote = FALSE)
## [1] Özgür Yazılım Kış Kampı
noquote(c2)
## [1] Özgür Yazılım Kış Kampı

Another very useful function is cat() which allows us to concatenate objects and print them either on screen or to a file. The output result is very similar to noquote(); however, cat() does not print the numeric line indicator. As a result, cat() can be useful for printing nicely formatted responses to users.

noquote(c2)
## [1] Özgür Yazılım Kış Kampı
cat(c2)
## Özgür Yazılım Kış Kampı
cat(c2, "2020")
## Özgür Yazılım Kış Kampı 2020
cat(letters)
## a b c d e f g h i j k l m n o p q r s t u v w x y z
cat(letters, sep = "-")
## a-b-c-d-e-f-g-h-i-j-k-l-m-n-o-p-q-r-s-t-u-v-w-x-y-z
cat(letters, sep = "")
## abcdefghijklmnopqrstuvwxyz

long strings

cat(c1,c2, fill=1)
## Mustafa Akgül 
## Özgür Yazılım Kış Kampı

sprintf() is a useful printing function for precise control of the output. It is a wrapper for the C function sprintf and returns a character vector containing a formatted combination of text and variable values. To substitute in a string or string variable, use %s:

c2
## [1] "Özgür Yazılım Kış Kampı"
sprintf("Mustafa Akgül %s 2020", c2)
## [1] "Mustafa Akgül Özgür Yazılım Kış Kampı 2020"
sprintf("Hoşgeldin %s %s 2020",c1,c2)
## [1] "Hoşgeldin Mustafa Akgül Özgür Yazılım Kış Kampı 2020"
r<-3
sprintf("Kullandığım R versiyonu:%d",r)
## [1] "Kullandığım R versiyonu:3"

print with leading spaces

r<-3
sprintf("Kullandığım R versiyonu:%4d",r)
## [1] "Kullandığım R versiyonu:   3"

can also lead with zeros

r<-3
sprintf("Kullandığım R versiyonu:%04d",r)
## [1] "Kullandığım R versiyonu:0003"

For floating-point numbers, use %f for standard notation, and %e or %E for exponential notation:

‘%f’ indicates ‘fixed point’ decimal notation

sprintf("%f", pi)
## [1] "3.141593"

decimal notation with 3 decimal digits

sprintf("%.3f", pi)
## [1] "3.142"

1 integer and 0 decimal digits

sprintf("%1.0f", pi)
## [1] "3"

decimal notation with a minimum total width of 5 characters (including the decimal point) and only 1 digit to the right of the decimal point

sprintf("%5.1f", pi)
## [1] "  3.1"

fill empty digits with zeros

sprintf("%05.1f", pi)
## [1] "003.1"

print with sign (positive)

sprintf("%+f", pi)
## [1] "+3.141593"

prefix a space

sprintf("% f", pi)
## [1] " 3.141593"

exponential decimal notation ‘e’

sprintf("%e", pi)
## [1] "3.141593e+00"

exponential decimal notation ‘E’

sprintf("%E", pi)
## [1] "3.141593E+00"

11.4 counting strings

length("Bir berber bir berbere gel beraber bir berber dükkanı açalım demiş")
## [1] 1
length(c("Bir","berber","bir","berbere","gel","beraber","bir","berber", "dükkanı","açalım","demiş"))
## [1] 11
nchar("Bir berber bir berbere gel beraber bir berber dükkanı açalım demiş")
## [1] 66
nchar(c("Bir","berber","bir","berbere","gel","beraber","bir","berber", "dükkanı","açalım","demiş"))
##  [1] 3 6 3 7 3 7 3 6 7 6 5

#string manipulation with base R

To replace a character/s

x <- "This is A string."
chartr(old = "A", new = "a", x)
## [1] "This is a string."

replace any ‘d’ with ‘t’ and any ‘z’ with ‘a’

y <- "Tomorrow I plzn do lezrn zbout dexduzl znzlysis."
chartr(old = "dz", new = "ta", y)
## [1] "Tomorrow I plan to learn about textual analysis."

Note that chartr() replaces every identified letter for replacement so the only time I use it is when I am certain that I want to change every possible occurrence of a letter.

String Abbreviations

streets <- c("Main", "Elm", "Riverbend", "Mario", "Frederick")
# default abbreviations
abbreviate(streets)
##      Main       Elm Riverbend     Mario Frederick 
##    "Main"     "Elm"    "Rvrb"    "Mari"    "Frdr"
# set minimum length of abbreviation
abbreviate(streets, minlength = 2)
##      Main       Elm Riverbend     Mario Frederick 
##      "Mn"      "El"      "Rv"      "Mr"      "Fr"

Extract/Replace Substrings

To extract or replace substrings in a character vector there are three primary base R functions to use: substr(), substring(), and strsplit(). The purpose of substr() is to extract and replace substrings with specified starting and stopping characters:

alphabet <- paste(LETTERS, collapse = "")
# extract 18th character in string
substr(alphabet, start = 18, stop = 18)
## [1] "R"
# extract 18-24th characters in string
substr(alphabet, start = 18, stop = 24)
## [1] "RSTUVWX"
# replace 19-24th characters with `R`
substr(alphabet, start = 19, stop = 24) <- "RRRRRR"
alphabet
## [1] "ABCDEFGHIJKLMNOPQRRRRRRRYZ"

The purpose of substring() is to extract and replace substrings with only a specified starting point. substring() also allows you to extract/replace in a recursive fashion:

alphabet <- paste(LETTERS, collapse = "")
# extract 18th through last character
substring(alphabet, first = 18)
## [1] "RSTUVWXYZ"

recursive extraction; specify start position only

substring(alphabet, first = 18:24)
## [1] "RSTUVWXYZ" "STUVWXYZ"  "TUVWXYZ"   "UVWXYZ"    "VWXYZ"     "WXYZ"     
## [7] "XYZ"

recursive extraction; specify start and stop positions

 substring(alphabet, first = 1:5, last = 3:7)
## [1] "ABC" "BCD" "CDE" "DEF" "EFG"

To split the elements of a character string use strsplit():

z <- "The day after I will take a break and drink a beer."
strsplit(z, split = " ")
## [[1]]
##  [1] "The"   "day"   "after" "I"     "will"  "take"  "a"     "break" "and"  
## [10] "drink" "a"     "beer."
a <- "Alabama-Alaska-Arizona-Arkansas-California"
strsplit(a, split = "-")
## [[1]]
## [1] "Alabama"    "Alaska"     "Arizona"    "Arkansas"   "California"
unlist(strsplit(a, split = "-"))
## [1] "Alabama"    "Alaska"     "Arizona"    "Arkansas"   "California"

12 String Manipulation with stringr

There are three stringr functions that are closely related to their base R equivalents, but with a few enhancements:

  • Concatenate with str_c()
  • Number of characters with str_length()
  • Substring with str_sub()

str_c() is equivalent to the paste() functions:

same as paste0()

library(stringr)
## Warning: package 'stringr' was built under R version 4.2.3
str_c("Learning", "to", "use", "the", "stringr", "package")
## [1] "Learningtousethestringrpackage"

same as paste()

str_c("Learning", "to", "use", "the", "stringr", "package", sep = " ")
## [1] "Learning to use the stringr package"

allows recycling

str_c(letters, " is for", "...")
##  [1] "a is for..." "b is for..." "c is for..." "d is for..." "e is for..."
##  [6] "f is for..." "g is for..." "h is for..." "i is for..." "j is for..."
## [11] "k is for..." "l is for..." "m is for..." "n is for..." "o is for..."
## [16] "p is for..." "q is for..." "r is for..." "s is for..." "t is for..."
## [21] "u is for..." "v is for..." "w is for..." "x is for..." "y is for..."
## [26] "z is for..."

str_length() is similar to the nchar() function; however, str_length() behaves more appropriately with missing (‘NA’) values (for a character vector such as the one below, current versions of R give the same result for both):

some text with NA

# Character vector with two missing entries, used to compare how nchar() and
# str_length() treat NA.
text <- c("Learning", "to", NA, "use", "the", NA, "stringr", "package")

compare str_length() with nchar()

nchar(text)
## [1]  8  2 NA  3  3 NA  7  7
str_length(text)
## [1]  8  2 NA  3  3 NA  7  7

str_sub() is similar to substr(); however, it returns a zero length vector if any of its inputs are zero length, and otherwise expands each argument to match the longest. It also accepts negative positions, which are calculated from the left of the last character.

x <- "Learning to use the stringr package"

alternative indexing

str_sub(x, start = 1, end = 15)
## [1] "Learning to use"
str_sub(x, end = 15)
## [1] "Learning to use"
str_sub(x, start = 17)
## [1] "the stringr package"
str_sub(x, start = c(1, 17), end = c(15, 35))
## [1] "Learning to use"     "the stringr package"

using negative indices for start/end points from end of string

str_sub(x, start = -1)
## [1] "e"
str_sub(x, start = -19)
## [1] "the stringr package"
str_sub(x, end = -21)
## [1] "Learning to use"

Replacement

str_sub(x, end = 15) <- "I know how to use"
x
## [1] "I know how to use the stringr package"

12.1 Duplicate Characters Within a String

str_dup("beer", times = 3)
## [1] "beerbeerbeer"
str_dup("beer", times = 1:3)
## [1] "beer"         "beerbeer"     "beerbeerbeer"

use with a vector of strings

states_i_luv <- state.name[c(6, 23, 34, 35)]
str_dup(states_i_luv, times = 2)
## [1] "ColoradoColorado"         "MinnesotaMinnesota"      
## [3] "North DakotaNorth Dakota" "OhioOhio"

Remove Leading and Trailing Whitespace

text <- c("Text ", " with", " whitespace ", " on", "both ", " sides ")

remove whitespaces on the left side

str_trim(text, side = "left")
## [1] "Text "       "with"        "whitespace " "on"          "both "      
## [6] "sides "

remove whitespaces on the right side

str_trim(text, side = "right")
## [1] "Text"        " with"       " whitespace" " on"         "both"       
## [6] " sides"

remove whitespaces on both sides

str_trim(text, side = "both")
## [1] "Text"       "with"       "whitespace" "on"         "both"      
## [6] "sides"

12.2 Pad a String with Whitespace

To add whitespace, or to pad a string, use str_pad(). You can also use str_pad() to pad a string with specified characters.

str_pad("beer", width = 10, side = "left")
## [1] "      beer"
str_pad("beer", width = 10, side = "both")
## [1] "   beer   "
str_pad("beer", width = 10, side = "right", pad = "!")
## [1] "beer!!!!!!"

12.3 Set Operations for Character Strings

Set Union

set_1 <- c("lagunitas", "bells", "dogfish", "summit", "odell")
set_2 <- c("sierra", "bells", "harpoon", "lagunitas", "founders")
union(set_1, set_2)
## [1] "lagunitas" "bells"     "dogfish"   "summit"    "odell"     "sierra"   
## [7] "harpoon"   "founders"

Set Intersection To obtain the common elements of two character vectors use intersect():

intersect(set_1, set_2)
## [1] "lagunitas" "bells"

12.4 Identifying Different Elements

To obtain the non-common elements, or the difference, of two character vectors use setdiff():

returns elements in set_1 not in set_2

setdiff(set_1, set_2)
## [1] "dogfish" "summit"  "odell"

returns elements in set_2 not in set_1

setdiff(set_2, set_1)
## [1] "sierra"   "harpoon"  "founders"

12.5 Testing for Element Equality

To test if two vectors contain the same elements regardless of order use setequal():

set_3 <- c("woody", "buzz", "rex")
set_4 <- c("woody", "andy", "buzz")
set_5 <- c("andy", "buzz", "woody")
setequal(set_3, set_4)
## [1] FALSE
setequal(set_4, set_5)
## [1] TRUE

12.6 Testing for Exact Equality

To test if two character vectors are equal in content and order use identical():

set_6 <- c("woody", "andy", "buzz")
set_7 <- c("andy", "buzz", "woody")
set_8 <- c("woody", "andy", "buzz")
identical(set_6, set_7)
## [1] FALSE
identical(set_6, set_8)
## [1] TRUE

Identifying If Elements Are Contained in a String To test if an element is contained within a character vector use is.element() or %in%:

good <- "andy"
bad <- "sid"
is.element(good, set_8)
## [1] TRUE
good %in% set_8
## [1] TRUE
bad %in% set_8
## [1] FALSE

12.7 Sorting a String

sort(set_8)
## [1] "andy"  "buzz"  "woody"
sort(set_8, decreasing = TRUE)
## [1] "woody" "buzz"  "andy"

13 Regular Expression

A regular expression (aka regex) is a sequence of characters that defines a search pattern, mainly for use in pattern matching with text strings.

help(regex)

13.1 metacharacters